And just for fun…

from google.cloud import storage
import pandas as pd
import io
import os
import gzip
import plotly.express as px
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 from google.cloud import storage
      2 import pandas as pd
      3 import io

ModuleNotFoundError: No module named 'google'
service_account_id = 'elijahsandler@net-data-viz-handbook.iam.gserviceaccount.com'

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\\Users\\elija\\Documents\\24f-coop\\net-data-viz-handbook-fe2c5531555d.json'
# Initialize a GCS client
client = storage.Client()
print('1 ')
# Specify your bucket name and the specific .csv.gz file you want
bucket_name = 'gs_net-data-viz-handbook'
file_name = 'sample/sample_SIR_0_countries_incidence_daily.csv.gz'  # Update this to the specific file name
meta_file = 'sample/sample_SIR_0_meta.csv.gz'
print('2 ')
# Get the bucket and blob
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_name)
metablob = bucket.blob(meta_file)
print('3 ')

# Download the .csv.gz file as bytes
compressed_content = blob.download_as_bytes()
print('4 ')
# Decompress the .csv.gz content
with gzip.GzipFile(fileobj=io.BytesIO(compressed_content)) as gz:
    # Read the decompressed content into a pandas DataFrame
    df = pd.read_csv(gz)
    
# Download the .csv.gz file as bytes
compressed_content = metablob.download_as_bytes()
print('5 ')
# Decompress the .csv.gz content
with gzip.GzipFile(fileobj=io.BytesIO(compressed_content)) as gz:
    # Read the decompressed content into a pandas DataFrame
    df_meta = pd.read_csv(gz)
1 
2 
3 
4 
5 
df_meta.iloc[4, :]
run_id                                                                  5
starting_date                                                  2009-02-18
last_day_sims                                                  2010-02-17
n_runs                                                                100
fraction_susceptible    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
                                              ...                        
npi_multipliers_rule                                                  min
frac_pops                                                             NaN
ref_compartments                                                     [""]
avg_time_valid_runs                                            1877.09113
avg_time_all_runs                                              1877.09113
Name: 4, Length: 85, dtype: object
df_sum = df.drop(['t'], axis=1).groupby(['date', 'country_id', 'run_id']).sum()
# get only 1 country's data
country =  0
df_country = df_sum.loc[(slice(None), country), :]
df_country = df_country.droplevel('country_id').T.sum().reset_index()
# pivoting data. god what a good function.
df_pivot = df_country.reset_index().pivot(index='date', columns='run_id', values=0).fillna(0)

# zero-indexing run_id because we aren't barbarians
df_pivot.columns = df_pivot.columns - 1 
df_pivot
run_id 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
date
2009-02-17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2009-02-18 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2009-02-19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2009-02-20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2009-02-21 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2010-02-13 54 0 11 8 12 0 31 33 17 19 ... 0 29 99 25 14 0 0 0 0 25
2010-02-14 47 0 8 14 10 0 18 29 16 10 ... 0 33 70 9 17 0 0 0 0 11
2010-02-15 49 0 9 17 24 0 27 29 23 14 ... 0 33 77 15 14 0 0 0 0 21
2010-02-16 56 0 4 22 13 0 16 31 10 11 ... 0 28 60 21 21 0 0 0 0 16
2010-02-17 42 0 6 22 10 0 19 30 18 14 ... 0 24 54 15 19 0 0 0 0 15

366 rows × 100 columns

fig = px.line(df_pivot,
              labels={
                     "value": "Cases? Hospitalizations?",
                     "date": "Date",
            },
            title="2009/10 Incidence Spaghetti Plot")
fig.show()
C:\Users\elija\anaconda3\Lib\site-packages\plotly\express\_core.py:1222: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  df_output[col_name] = to_unindexed_series(df_input[argument])

And just for fun…#

It’s not really for fun because, you know, it’s my job, but whatever. It’s fun too.

from curvestat import CurveBoxPlot
from curvestat import LoadRisk
from time import time
import numpy as np
import plotly.graph_objs as go
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from matplotlib.colors import to_rgba
from collections import defaultdict
# Array of time steps
time_arr = np.arange(0,len(df_pivot),1)
time_arr;
start = time()
Boxplot = CurveBoxPlot(curves=df_pivot,sample_curves=10,sample_repititions=40,time=time_arr)

# Choose ranking
rank_allornothing=Boxplot.rank_allornothing()
rank_max = Boxplot.rank_max()

# Get envelope with this choice of ranking
boundaries = Boxplot.get_boundaries(rank_max,percentiles=[50,90])

# ... HEATMAP : 
# First, define which curves we want the heatmap for
heatmapcurves = list(boundaries['curve-based'][50]['curves'])

# Then find the heatmap
heatmap_50 = Boxplot.get_peakheatmap(heatmap_curves=heatmapcurves)

# Plot results
Boxplot.plot_everything()

end = time()
diff = end - start
print(f"Ran {len(df_pivot)} curves in {round(diff, 2)} seconds")
Ran 366 curves in 1.98 seconds
_images/540b3108ee7cb1d6da3e7dcacca7d859d9239e08a48ed912da6a72ba53f4bbb8.png
start = time()
# unfortunately, pairwise comparisons are O(n^2)
curve_diff = dict()

for first_curve in df_pivot.columns:
    for second_curve in df_pivot.columns[first_curve+1:]:
        curve_diff[(first_curve, second_curve)] = \
        ((df_pivot[first_curve] - df_pivot[second_curve])).abs().sum()
end = time()
print(f'pairwise compared {len(df_pivot)} curves in {round(end-start, 4)} seconds')
pairwise compared 366 curves in 0.953 seconds
# Get all unique elements
elements = sorted(set(i for pair in curve_diff.keys() for i in pair))
n = len(elements)

# Create the distance matrix
distance_matrix = np.zeros((n, n))
for (i, j), score in curve_diff.items():
    distance_matrix[i, j] = score
    distance_matrix[j, i] = score  # Because the matrix is symmetric

# Convert distance matrix to similarity matrix
similarity_matrix = np.max(distance_matrix) - distance_matrix

# Fit K-Means clustering
k = 3  # Change this to the number of desired clusters
kmeans = KMeans(n_clusters=k)
labels = kmeans.fit_predict(similarity_matrix)
C:\Users\elija\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning:

The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning

C:\Users\elija\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
colors = ['red', 'blue', 'green', 'gray', 'cyan']
cmap = dict(zip(range(len(colors)), colors))
traces = []
# Plot each curve with its respective color based on the group
for column, group in zip(df_pivot.columns, labels):
    trace = go.Scatter(
        x=df_pivot.index,
        y=df_pivot[column],
        mode='lines',
        name=f'Curve {column}',
        line=dict(color=cmap[group])
    )
    traces.append(trace)

fig = go.Figure(traces)
    
# Update layout
fig.update_layout(
    title='Grouped Incidence',
    xaxis_title='Date',
    yaxis_title='Cases? ',
    showlegend=False
)

# Show the plot
fig.show()
di = defaultdict(lambda: defaultdict())
for label in np.unique(labels):
    di[label]['data'] = df_pivot[[i for i in df_pivot.columns if labels[i] == label]]
    
    # create boxplot instance, add curves
    Boxplot = CurveBoxPlot(curves=di[label]['data'],sample_curves=5,sample_repititions=200,time=time_arr)

    # rank curves
    rank_allornothing=Boxplot.rank_allornothing()
    boundaries = Boxplot.get_boundaries(rank_allornothing,percentiles=[0, 50,90])
    
    di[label]['boundaries'] = boundaries
fig = go.Figure()

for group in di:

        fig.add_trace(go.Scatter(
            name=f'Lower Quartile {group}',
            x=time_arr,
            y=di[group]['boundaries']['curve-based'][50]['min-boundary'],
            marker=dict(color="#444"),
            line=dict(width=0),
            mode='lines',
            showlegend=False))
        
        fig.add_trace(go.Scatter(
            name=f'Upper Quartile {group}',
            x=time_arr,
            y=di[group]['boundaries']['curve-based'][50]['max-boundary'],
            mode='lines',
            marker=dict(color="#444"),
            line=dict(width=0),
            fillcolor='rgba' + str(to_rgba(cmap[group], alpha = .1)),
            fill='tonexty',
            showlegend=False
        ))
        
        fig.add_trace(go.Scatter(
            name=f'Median {group}',
            x=time_arr,
            y=df_pivot[di[group]['boundaries']['curve-based'][0]['curves'][0]],
            marker=dict(color=cmap[group]),
            line=dict(width=1),
            mode='lines',
            showlegend=False
        ))

fig.update_layout(
    title='Middle 90% Curves of For Each Group',
    xaxis_title='Day',
    yaxis_title='Cases',
    showlegend=False
)        
fig.show()
centrality = {curve: 0 for curve in set(curve for pair in curve_diff.keys() for curve in pair)}

# Compute centrality scores
for (curve1, curve2), diff in curve_diff.items():
    centrality[curve1] += diff
    centrality[curve2] += diff

# Convert centrality scores to a DataFrame for easy sorting
centrality_df = pd.DataFrame(list(centrality.items()), columns=['Curve', 'Centrality'])
centrality_df['Centrality'] = 1 - (centrality_df['Centrality']-centrality_df['Centrality'].min())/(centrality_df['Centrality'].max()-centrality_df['Centrality'].min())

centrality_df['Group'] = labels
centrality_df.head()
Curve Centrality Group
0 0 0.792745 0
1 1 0.449300 2
2 2 0.628479 1
3 3 0.646675 1
4 4 0.832616 1
# Rank curves from most to least central, reminder that this is done by summing the area between all curves
ranked_curves = centrality_df.sort_values(by='Centrality', ascending=False).reset_index(drop=True)
ranked_curves
Curve Centrality Group
0 83 1.000000 1
1 32 0.997673 1
2 9 0.992977 1
3 48 0.991864 1
4 86 0.989714 1
... ... ... ...
95 92 0.363995 0
96 79 0.331418 0
97 65 0.314794 0
98 27 0.066969 1
99 71 0.000000 0

100 rows × 3 columns

px.histogram(x=ranked_curves['Centrality'], color=ranked_curves['Group'])
df = df_pivot.T.quantile([0, .10, .25, .75, .90, 1]).T
df['Median'] = df_pivot.T.median()
df['Mean'] = df_pivot.T.mean()

px.scatter(df, title='The Problem with Curve Box Plots')

This graph illustrates the issue we face. There are times where the 75th percentile points look as though they are well above the 90th percentile points, which obviously isn’t right. curvestat gets around this problem by basically just widening the confidence interval: now the 75% interval looks like the 90% interval. That doesn’t help us though. At that point, we’re deviating from the conventional use of statistics, and while I’m a big fan of breaking conventions, it is counter-productive with regards to our goal of making these graphs more legible.

curvestat’s use of sampling also means that their package is not strictly deterministic, which is a big issue as well. This is rectified with my “area under the curve” method.

# one week rolling maximum for each run
df_roll = df_pivot.rolling(7).max().dropna()

# same as above
df = df_roll.T.quantile([0, .10, .25, .75, .90, 1]).T
df['Median'] = df_roll.T.median()
df['Mean'] = df_roll.T.mean()
px.scatter(df, title='The Problem with Curve Box Plots: Rollling Average')
(df_pivot.max() > 1200).astype(int).mean()
0.52

This seems to suggest that ~90% of the peaks are below 1200. Alas, 52% of them are.

px.histogram(df_pivot.max(), nbins=20)